library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Read the CSV directly out of the zip archive. Only `date` gets an explicit
# class; `colClasses` is given as a named character vector (the documented
# form), so the other columns keep read.csv's automatic type detection.
activity <- read.csv(
  unz("activity.zip", "activity.csv"),
  colClasses = c(date = "Date")
)
# Convert the interval column into a factor, so that plotting/processing
# does not treat the labels as integer numbers (i.e. the distance between
# 1155 and 1200 becomes equal to the distance between 1145 and 1150).
# Zero-padding to four digits also makes the labels more human-readable.
activity$interval <- factor(sprintf("%04d", activity$interval))
# Count the missing step readings for every day and draw them as bars.
activity %>%
  group_by(date) %>%
  summarize(miss_count = sum(is.na(steps))) %>%
  ggplot(aes(date, miss_count)) +
  geom_bar(stat = "identity")
# Print the same per-day counts as a plain vector.
with(
  activity %>%
    group_by(date) %>%
    summarize(miss_count = sum(is.na(steps))),
  miss_count
)
## [1] 288 0 0 0 0 0 0 288 0 0 0 0 0 0 0 0 0
## [18] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 288 0 0
## [35] 288 0 0 0 0 288 288 0 0 0 288 0 0 0 0 0 0
## [52] 0 0 0 0 0 0 0 0 0 288
# Count the missing step readings for every interval label and print the
# counts as a plain vector.
with(
  activity %>%
    group_by(interval) %>%
    summarize(miss_count = sum(is.na(steps))),
  miss_count
)
## [1] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [36] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [71] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [106] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [141] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [176] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [211] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [246] 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
## [281] 8 8 8 8 8 8 8 8
# Total steps per day as a line. `sum(steps)` has no `na.rm`, so a day
# containing any NA yields an NA total.
activity %>%
  group_by(date) %>%
  summarize(s = sum(steps)) %>%
  ggplot(aes(date, s)) +
  geom_line(group = 1)
## Warning: Removed 2 rows containing missing values (geom_path).
# Per-day summary:
#   S       - total steps with NAs dropped (0 when every reading is NA)
#   missing - TRUE when all 288 readings of the day are NA
sfrm <- activity %>%
  group_by(date) %>%
  summarize(
    S = sum(steps, na.rm = TRUE),
    missing = sum(is.na(steps)) == 288
  )
# Daily totals over time; fully-missing days are highlighted in orange.
ggplot(sfrm, aes(date, S)) +
  geom_line() +
  geom_point(aes(color = missing), size = 3) +
  scale_color_manual(values = c("black", "orange"))
# Each day's total against the previous day's total (lag 1), coloured by
# whether the day was fully missing, with a linear fit on top.
ggplot(sfrm, aes(x = S, y = lag(S, 1), color = missing)) +
  geom_point(size = 3) +
  scale_color_manual(values = c("black", "orange")) +
  stat_smooth(method = lm)
## Warning: Removed 1 rows containing missing values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
# Same lag-1 scatter as before, but without a colour mapping, so a single
# linear fit covers all days. No colour scale is added because no colour
# aesthetic is mapped in this plot.
ggplot(sfrm, aes(x = S, y = lag(S, 1))) +
  geom_point(size = 3) +
  stat_smooth(method = lm)
## Warning: Removed 1 rows containing missing values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
# Daily totals in ascending order, to eyeball the near-empty days.
sort(sfrm[["S"]])
## [1] 0 0 0 0 0 0 0 0 41 126 2492
## [12] 3219 4472 5018 5441 6778 7047 7336 8334 8355 8821 8841
## [23] 8918 9819 9900 10056 10119 10139 10183 10304 10395 10439 10571
## [34] 10600 10765 11015 11162 11352 11458 11829 11834 12116 12426 12608
## [45] 12787 12811 12883 13294 13452 13460 13646 14339 14478 15084 15098
## [56] 15110 15414 15420 17382 20427 21194
# Flag days whose total is at most 126 steps (empty or near-empty days).
sfrm[["low.or.0"]] <- sfrm[["S"]] <= 126
# Mean daily total over the remaining, non-flagged days.
M <- mean(sfrm[["S"]][!sfrm[["low.or.0"]]])
# Imputed totals: flagged days take the mean M, other days keep their total.
sfrm[["imp"]] <- replace(sfrm[["S"]], sfrm[["low.or.0"]], M)
# Imputed daily totals over time; fully-missing days are shown in orange.
ggplot(sfrm, aes(date, imp)) +
  geom_line() +
  geom_point(aes(color = missing), size = 3) +
  scale_color_manual(values = c("black", "orange"))
# Interpolation would be nicer maybe, but oh well…
# Index plot of the raw step counts in row order. The column is spelled out
# in full (`steps`) instead of relying on `$` partial matching.
plot(activity$steps)
# Combine the date with the zero-padded "HHMM" interval label into a UTC
# timestamp, so readings can be plotted on a continuous time axis.
activity$timestamp <- as.POSIXct(
  strptime(
    paste(activity$date, activity$interval),
    format = "%Y-%m-%d %H%M",
    tz = "UTC"
  )
)
# Step counts over the whole observation period.
ggplot(activity, aes(timestamp, steps)) +
  geom_line()
## Warning: Removed 576 rows containing missing values (geom_path).
# List the days flagged as empty or near-empty, in chronological order.
sfrm %>%
  filter(low.or.0) %>%
  arrange(date)
## Source: local data frame [10 x 5]
##
## date S missing low.or.0 imp
## 1 2012-10-01 0 TRUE TRUE 11185.12
## 2 2012-10-02 126 FALSE TRUE 11185.12
## 3 2012-10-08 0 TRUE TRUE 11185.12
## 4 2012-11-01 0 TRUE TRUE 11185.12
## 5 2012-11-04 0 TRUE TRUE 11185.12
## 6 2012-11-09 0 TRUE TRUE 11185.12
## 7 2012-11-10 0 TRUE TRUE 11185.12
## 8 2012-11-14 0 TRUE TRUE 11185.12
## 9 2012-11-15 41 FALSE TRUE 11185.12
## 10 2012-11-30 0 TRUE TRUE 11185.12
# Zoom in on 2012-10-01 through 2012-10-03.
frm <- activity %>%
  filter(
    date %in% seq(as.Date("2012-10-01"), as.Date("2012-10-03"), by = "day")
  )
ggplot(frm, aes(timestamp, steps)) +
  geom_line() +
  geom_point()
## Warning: Removed 288 rows containing missing values (geom_path).
## Warning: Removed 288 rows containing missing values (geom_point).
# Zoom in on 2012-10-07 through 2012-10-09.
frm <- activity %>%
  filter(
    date %in% seq(as.Date("2012-10-07"), as.Date("2012-10-09"), by = "day")
  )
ggplot(frm, aes(timestamp, steps)) +
  geom_line() +
  geom_point()
## Warning: Removed 288 rows containing missing values (geom_point).
# Zoom in on 2012-10-31 through 2012-11-05.
frm <- activity %>%
  filter(
    date %in% seq(as.Date("2012-10-31"), as.Date("2012-11-05"), by = "day")
  )
ggplot(frm, aes(timestamp, steps)) +
  geom_line() +
  geom_point()
## Warning: Removed 576 rows containing missing values (geom_point).
# Zoom in on 2012-11-08 through 2012-11-12.
frm <- activity %>%
  filter(
    date %in% seq(as.Date("2012-11-08"), as.Date("2012-11-12"), by = "day")
  )
ggplot(frm, aes(timestamp, steps)) +
  geom_line() +
  geom_point()
## Warning: Removed 576 rows containing missing values (geom_point).
# Zoom in on 2012-11-13 through 2012-11-16.
frm <- activity %>%
  filter(
    date %in% seq(as.Date("2012-11-13"), as.Date("2012-11-16"), by = "day")
  )
ggplot(frm, aes(timestamp, steps)) +
  geom_line() +
  geom_point()
## Warning: Removed 288 rows containing missing values (geom_point).
# Zoom in on 2012-11-28 through 2012-11-30.
frm <- activity %>%
  filter(
    date %in% seq(as.Date("2012-11-28"), as.Date("2012-11-30"), by = "day")
  )
ggplot(frm, aes(timestamp, steps)) +
  geom_line() +
  geom_point()
## Warning: Removed 288 rows containing missing values (geom_path).
## Warning: Removed 288 rows containing missing values (geom_point).
# Helper dataset for missing-value analysis: one row per day.
#   total_steps    - sum of the recorded steps (NAs ignored)
#   na_count       - number of missing readings
#   non_zero_count - number of recorded readings that are non-zero
# NAs are dropped inside the sums instead of back-filling the whole frame
# with 0 afterwards. A blanket back-fill would also overwrite NAs in the
# logical `na.or.low` column with numeric 0, turning it into a 0/1 numeric
# column and breaking the logical subsetting used for `invalid.days`.
# For fully-missing days both approaches give 0 for the two sums.
activity.missings <- activity %>%
  group_by(date) %>%
  summarize(
    total_steps = sum(steps, na.rm = TRUE),
    na_count = sum(is.na(steps)),
    non_zero_count = sum(steps != 0, na.rm = TRUE)
  )
# A day is invalid when all 288 readings are missing or when at most two
# readings are non-zero.
activity.missings$na.or.low <-
  activity.missings$na_count == 288 | activity.missings$non_zero_count <= 2
# "%u" is the ISO weekday number (Monday = 1); unlike weekdays(), it does
# not depend on the session locale.
activity.missings$is.monday <- format(activity.missings$date, "%u") == "1"
invalid.days <- activity.missings$date[activity.missings$na.or.low]
# Mean steps for every interval label across all days, ignoring NAs.
activity %>%
  group_by(interval) %>%
  summarize(avg_steps = mean(steps, na.rm = TRUE))
## Source: local data frame [288 x 2]
##
## interval avg_steps
## 1 0000 1.7169811
## 2 0005 0.3396226
## 3 0010 0.1320755
## 4 0015 0.1509434
## 5 0020 0.0754717
## 6 0025 2.0943396
## 7 0030 0.5283019
## 8 0035 0.8679245
## 9 0040 0.0000000
## 10 0045 1.4716981
## .. ... ...
# Distribution of the per-interval mean step counts (NAs ignored).
# Title and axis label are set explicitly; by default hist() would use the
# whole deparsed argument expression for both.
hist(
  summarize(
    group_by(activity, interval),
    avg_steps = mean(steps, na.rm = TRUE)
  )$avg_steps,
  main = "Average steps per interval (NAs ignored)",
  xlab = "Average steps"
)
# Mean steps for every interval label, using only days that are not listed
# in `invalid.days`. No `na.rm` here, so any remaining NA would propagate.
activity %>%
  filter(!(date %in% invalid.days)) %>%
  group_by(interval) %>%
  summarize(avg_steps = mean(steps))
## Source: local data frame [288 x 2]
##
## interval avg_steps
## 1 0000 1.78431373
## 2 0005 0.35294118
## 3 0010 0.13725490
## 4 0015 0.15686275
## 5 0020 0.07843137
## 6 0025 2.17647059
## 7 0030 0.54901961
## 8 0035 0.90196078
## 9 0040 0.00000000
## 10 0045 1.52941176
## .. ... ...
# Distribution of the per-interval mean step counts, using only days that
# are not listed in `invalid.days`.
# Title and axis label are set explicitly; by default hist() would use the
# whole deparsed argument expression for both.
hist(
  summarize(
    group_by(filter(activity, !date %in% invalid.days), interval),
    avg_steps = mean(steps)
  )$avg_steps,
  main = "Average steps per interval (invalid days excluded)",
  xlab = "Average steps"
)